Invasion hologenomics - Data Summary

Load libraries

Load data

load("data/squirrels_data.Rdata")

Sample summary

Summary of sampled individuals and analysed faecal samples.

#number of samples
length(sample_metadata$sample)
## [1] 190
#number of samples by species
sample_metadata %>%
  group_by(species) %>%
  summarise(n_samples = length(sample)) %>%
  tt()
tinytable_ezcd7fufmfwi8ypfcfiy
species n_samples
Sciurus carolinensis 80
Sciurus vulgaris 110
#number of samples by species and sex
sample_metadata %>%
  group_by(species, sex) %>%
  summarise(n_samples = length(sample)) %>%
  tt()
tinytable_nr2svlmltxsjiznzao6x
species sex n_samples
Sciurus carolinensis F 44
Sciurus carolinensis M 36
Sciurus vulgaris F 63
Sciurus vulgaris M 47
#number of samples by species and development
sample_metadata %>%
  group_by(species, development) %>%
  summarise(n_samples = length(sample)) %>%
  tt()
tinytable_grw6emx3fcmxdassfbwc
species development n_samples
Sciurus carolinensis Adult 71
Sciurus carolinensis Juvenile 2
Sciurus carolinensis Nursing 3
Sciurus carolinensis Pregnant 4
Sciurus vulgaris Adult 90
Sciurus vulgaris Juvenile 1
Sciurus vulgaris Nursing 8
Sciurus vulgaris Pregnant 11
#number of samples by species and type of area
sample_metadata %>%
  group_by(species,area_type) %>%
  summarise(n_samples = length(sample)) %>%
  tt()
tinytable_pr13s0g5sxf0yhbz3bw7
species area_type n_samples
Sciurus carolinensis rural 29
Sciurus carolinensis suburban 24
Sciurus carolinensis urban 27
Sciurus vulgaris rural 37
Sciurus vulgaris suburban 30
Sciurus vulgaris urban 43
#number of distinct squirrels
n_distinct(sample_metadata$animal)
## [1] 108
#number of squirrels by species and type of area
sample_metadata %>%
  group_by(species,area_type) %>%
  summarise(distinct_squirrels = n_distinct(animal)) %>%
  tt()
tinytable_tffo1nxnoyxbde3f4ygm
species area_type distinct_squirrels
Sciurus carolinensis rural 14
Sciurus carolinensis suburban 13
Sciurus carolinensis urban 18
Sciurus vulgaris rural 21
Sciurus vulgaris suburban 14
Sciurus vulgaris urban 28
#number of squirrels by species and season
sample_metadata %>%
  group_by(species,season) %>%
  summarise(distinct_squirrels = n_distinct(animal)) %>%
  tt()
tinytable_ivpi2c16be23bmllvafg
species season distinct_squirrels
Sciurus carolinensis autumn 33
Sciurus carolinensis spring-summer 22
Sciurus carolinensis winter 25
Sciurus vulgaris autumn 39
Sciurus vulgaris spring-summer 38
Sciurus vulgaris winter 33
#n of analysed faecal samples
ncol(read_counts)
## [1] 191

Geographical location of sampled red squirrel (light blue) and grey squirrel (pink) populations in Italy.

#Summarise for generating map
options(dplyr.summarise.inform = FALSE)
sample_metadata_summary <- sample_metadata %>%
  #Group by geography and count samples
  select(sample, latitude, longitude, country, species) %>%
  group_by(latitude, longitude, species) %>%
  summarize(count = n()) %>%
  ungroup()

italy <- map_data("world", region="italy") %>%
  summarise(long = mean(long), lat = mean(lat))

#plotting on map
sample_metadata_summary %>%
  ggplot(.) +
  #render map
  geom_map(
    data=map_data("world", region="italy"),
    map = map_data("world", region="italy"),
    aes(long, lat, map_id=region),
    color = "white", fill = "#cccccc", linewidth = 0.2
  ) +
  #render points
  geom_point(
    aes(x=longitude,y=latitude, color=species),
    alpha=0.5, shape=16) +
  #add general plot layout
  theme_minimal() +
  theme(legend.position = "none",
        axis.title.x=element_blank(),
        axis.title.y=element_blank()
  ) + coord_map("mercator")

Sequencing data summary

Total amount of sequencing data generated from the analysed samples.

#amount of discarded data (GB)
sum(round(((sample_metadata$metagenomic_bases+sample_metadata$host_bases)/
             (1-sample_metadata$bases_lost_fastp_percent))-
            (sample_metadata$metagenomic_bases+sample_metadata$host_bases)))/1000000000
## [1] 63.90045
#amount of host data (GB)
sum(sample_metadata$host_bases)/1000000000
## [1] 166.0275
#amount of metagenomic data (GB)
sum(sample_metadata$metagenomic_bases)/1000000000
## [1] 786.1584
#amount of estimated prokaryotic data (singleM)
sum(sample_metadata$metagenomic_bases * sample_metadata$singlem_fraction)/1000000000
## [1] 557.7475

Origin of DNA sequences obtained from each sample.

sequence_fractions <- read_counts %>%
  pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
  group_by(sample) %>%
  summarise(mags = sum(value)) %>%
  left_join(sample_metadata, by = join_by(sample == sample))  %>%
  select(sample,mags,metagenomic_bases,host_bases,bases_lost_fastp_percent) %>%
  mutate(mags_bases = mags*146) %>%
  mutate(lowqual_bases = ((metagenomic_bases+host_bases)/(1-bases_lost_fastp_percent))-(metagenomic_bases+host_bases)) %>%
  mutate(unmapped_bases = metagenomic_bases - mags_bases) %>%
  mutate(unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases)) %>%
  select(sample,mags_bases,unmapped_bases,host_bases,lowqual_bases)

mags_bases_mean <- sequence_fractions %>%
  mutate(mags_bases = mags_bases / 1000000000) %>%
  select(mags_bases) %>%
  pull() %>%
  mean()

sequence_fractions %>%
  pivot_longer(!sample, names_to = "fraction", values_to = "value") %>%
  mutate(value = value / 1000000000) %>%
  mutate(fraction = factor(fraction, levels = c("lowqual_bases","host_bases","unmapped_bases","mags_bases"))) %>%
  ggplot(., aes(x = sample, y = value, fill=fraction)) +
  geom_bar(position="stack", stat = "identity") +
  scale_fill_manual(values=c("#CCCCCC","#178a94","#ee8080","#d03161")) +
  geom_hline(yintercept = mags_bases_mean, linetype = "dashed", color = "black") +
  labs(x = "Samples", y = "Amount of data (GB)") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "bottom")

MAGs summary

#number of MAGs
nrow(read_counts)
## [1] 1687
#number of MAGs without species-level annotation (i.e., "new species")
genome_metadata %>%
  filter(species == "s__") %>%
  nrow()
## [1] 1455
#number of phylums
genome_metadata %>%
  select(phylum) %>%
  unique() %>%
  pull() %>%
  length()
## [1] 13